{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Final Prediction: LightGBM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train a GBM using K-fold CV and use the mean test prediction across the folds for the final submission."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
      "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
      "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
      "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
      "  \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
     ]
    }
   ],
   "source": [
    "import lightgbm as lgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedKFold"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of CV folds."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_FOLDS = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make subsequent runs reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "RANDOM_SEED = 2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(RANDOM_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load all features we extracted earlier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_lists = [\n",
    "    'simple_summaries',\n",
    "    'jaccard_ngrams',\n",
    "    'fuzzy',\n",
    "    'tfidf',\n",
    "    #'lda',\n",
    "    'nlp_tags',\n",
    "    'wordnet_similarity',\n",
    "    'phrase_embedding',\n",
    "    'wmd',\n",
    "    'wm_intersect',\n",
    "    \n",
    "    '3rdparty_abhishek',\n",
    "    '3rdparty_dasolmar_whq',\n",
    "    '3rdparty_mephistopheies',\n",
    "    '3rdparty_image_similarity',\n",
    "    \n",
    "    'magic_pagerank',\n",
    "    'magic_frequencies',\n",
    "    #'magic_cooccurrence_matrix',\n",
    "    'magic_cooccurrence_matrix_raw',\n",
    "    \n",
    "    'oofp_nn_mlp_with_magic',\n",
    "    'oofp_nn_cnn_with_magic',\n",
    "    'oofp_nn_bi_lstm_with_magic',\n",
    "    'oofp_nn_siamese_lstm_attention',\n",
    "    \n",
    "    'wordmatchshare',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_train, df_test, feature_list_ix = project.load_feature_lists(feature_lists)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally drop feature columns that contain NaN or infinite values (disabled by default)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set to True to drop every feature column that contains a NaN/inf value.\n",
    "remove_invalid_feats = False\n",
    "if remove_invalid_feats:\n",
    "    # Treat infinities as missing so one isnull() check covers both cases.\n",
    "    df_train = df_train.replace([np.inf, -np.inf], np.nan)\n",
    "    df_test = df_test.replace([np.inf, -np.inf], np.nan)\n",
    "\n",
    "    # Drop a column from BOTH frames if it is invalid in EITHER of them.\n",
    "    # Dropping per frame could remove different columns from train and test,\n",
    "    # leaving the two feature matrices misaligned.\n",
    "    # The mask is positional: both frames are expected to hold the same\n",
    "    # features in the same order.\n",
    "    invalid_mask = df_train.isnull().any().values | df_test.isnull().any().values\n",
    "    df_train = df_train.loc[:, ~invalid_mask]\n",
    "    df_test = df_test.loc[:, ~invalid_mask]\n",
    "    # NOTE(review): the start/end indices in feature_list_ix are not updated\n",
    "    # here and will be stale once columns are dropped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on the raw NumPy arrays from here on.\n",
    "X_train, X_test = df_train.values, df_test.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = kg.io.load(project.features_dir + 'y_train.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "View feature summary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X train: (404290, 182)\n",
      "X test:  (2345796, 182)\n",
      "y train: (404290,)\n"
     ]
    }
   ],
   "source": [
    "# Print the shape of every array that goes into training and prediction.\n",
    "for _label, _array in (('X train:', X_train), ('X test: ', X_test), ('y train:', y_train)):\n",
    "    print(_label, _array.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature_list</th>\n",
       "      <th>start_index</th>\n",
       "      <th>end_index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>simple_summaries</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>jaccard_ngrams</td>\n",
       "      <td>9</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fuzzy</td>\n",
       "      <td>24</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>tfidf</td>\n",
       "      <td>31</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>nlp_tags</td>\n",
       "      <td>33</td>\n",
       "      <td>68</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>wordnet_similarity</td>\n",
       "      <td>69</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>phrase_embedding</td>\n",
       "      <td>71</td>\n",
       "      <td>76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>wmd</td>\n",
       "      <td>77</td>\n",
       "      <td>77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>wm_intersect</td>\n",
       "      <td>78</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>3rdparty_abhishek</td>\n",
       "      <td>80</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>3rdparty_dasolmar_whq</td>\n",
       "      <td>96</td>\n",
       "      <td>144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>3rdparty_mephistopheies</td>\n",
       "      <td>145</td>\n",
       "      <td>166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>3rdparty_image_similarity</td>\n",
       "      <td>167</td>\n",
       "      <td>167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>magic_pagerank</td>\n",
       "      <td>168</td>\n",
       "      <td>169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>magic_frequencies</td>\n",
       "      <td>170</td>\n",
       "      <td>173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>magic_cooccurrence_matrix_raw</td>\n",
       "      <td>174</td>\n",
       "      <td>175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>oofp_nn_mlp_with_magic</td>\n",
       "      <td>176</td>\n",
       "      <td>176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>oofp_nn_cnn_with_magic</td>\n",
       "      <td>177</td>\n",
       "      <td>177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>oofp_nn_bi_lstm_with_magic</td>\n",
       "      <td>178</td>\n",
       "      <td>178</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>oofp_nn_siamese_lstm_attention</td>\n",
       "      <td>179</td>\n",
       "      <td>179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>wordmatchshare</td>\n",
       "      <td>180</td>\n",
       "      <td>181</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      feature_list  start_index  end_index\n",
       "0                 simple_summaries            0          8\n",
       "1                   jaccard_ngrams            9         23\n",
       "2                            fuzzy           24         30\n",
       "3                            tfidf           31         32\n",
       "4                         nlp_tags           33         68\n",
       "5               wordnet_similarity           69         70\n",
       "6                 phrase_embedding           71         76\n",
       "7                              wmd           77         77\n",
       "8                     wm_intersect           78         79\n",
       "9                3rdparty_abhishek           80         95\n",
       "10           3rdparty_dasolmar_whq           96        144\n",
       "11         3rdparty_mephistopheies          145        166\n",
       "12       3rdparty_image_similarity          167        167\n",
       "13                  magic_pagerank          168        169\n",
       "14               magic_frequencies          170        173\n",
       "15   magic_cooccurrence_matrix_raw          174        175\n",
       "16          oofp_nn_mlp_with_magic          176        176\n",
       "17          oofp_nn_cnn_with_magic          177        177\n",
       "18      oofp_nn_bi_lstm_with_magic          178        178\n",
       "19  oofp_nn_siamese_lstm_attention          179        179\n",
       "20                  wordmatchshare          180        181"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(feature_list_ix, columns=['feature_list', 'start_index', 'end_index'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train models & compute test predictions from each fold"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate partitions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stratified, shuffled K-fold splitter; seeded so the partitions are repeatable.\n",
    "kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test_pred = np.zeros((len(X_test), NUM_FOLDS))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit all folds."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_scores = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting fold 1 of 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:116: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold 1: 2350 rounds, training loss 0.136952, validation loss 0.192417\n",
      "\n",
      "Fitting fold 2 of 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:116: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold 2: 2666 rounds, training loss 0.132270, validation loss 0.189973\n",
      "\n",
      "Fitting fold 3 of 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:116: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold 3: 2677 rounds, training loss 0.132104, validation loss 0.191148\n",
      "\n",
      "Fitting fold 4 of 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:116: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold 4: 2853 rounds, training loss 0.128992, validation loss 0.191289\n",
      "\n",
      "Fitting fold 5 of 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:116: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n",
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold 5: 3194 rounds, training loss 0.125750, validation loss 0.184900\n",
      "\n",
      "CPU times: user 8h 42min 47s, sys: 11min 58s, total: 8h 54min 45s\n",
      "Wall time: 3h 17min 32s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Hyper-parameters do not change between folds, so define them once up front.\n",
    "lgb_params = {\n",
    "    'objective': 'binary',\n",
    "    'metric': 'binary_logloss',\n",
    "    'boosting': 'gbdt',\n",
    "    'device': 'cpu',\n",
    "    'feature_fraction': 0.486,\n",
    "    'num_leaves': 158,\n",
    "    'lambda_l2': 50,\n",
    "    'learning_rate': 0.01,\n",
    "    'num_boost_round': 5000,\n",
    "    'early_stopping_rounds': 10,\n",
    "    'verbose': 1,\n",
    "    'bagging_fraction_seed': RANDOM_SEED,\n",
    "    'feature_fraction_seed': RANDOM_SEED,\n",
    "}\n",
    "\n",
    "# `num_boost_round` / `early_stopping_rounds` are passed to `lgb.train` as keyword\n",
    "# arguments below. Leaving them in the params dict as well makes LightGBM emit a\n",
    "# `Found ... in params` UserWarning on every fold, so they are stripped from the\n",
    "# copy handed to the booster (`lgb_params` itself is left intact).\n",
    "train_only_keys = ('num_boost_round', 'early_stopping_rounds')\n",
    "booster_params = {\n",
    "    key: value for key, value in lgb_params.items() if key not in train_only_keys\n",
    "}\n",
    "\n",
    "for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):\n",
    "    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')\n",
    "\n",
    "    X_fold_train = X_train[ix_train]\n",
    "    X_fold_val = X_train[ix_val]\n",
    "\n",
    "    y_fold_train = y_train[ix_train]\n",
    "    y_fold_val = y_train[ix_val]\n",
    "\n",
    "    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)\n",
    "    lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)\n",
    "    evals_result = {}\n",
    "\n",
    "    # The training set is included in valid_sets so that its loss is recorded\n",
    "    # under the 'training' key of evals_result; the held-out fold is 'valid_1'.\n",
    "    model = lgb.train(\n",
    "        booster_params,\n",
    "        lgb_data_train,\n",
    "        valid_sets=[lgb_data_train, lgb_data_val],\n",
    "        evals_result=evals_result,\n",
    "        num_boost_round=lgb_params['num_boost_round'],\n",
    "        early_stopping_rounds=lgb_params['early_stopping_rounds'],\n",
    "        verbose_eval=False,\n",
    "    )\n",
    "\n",
    "    fold_train_scores = evals_result['training'][lgb_params['metric']]\n",
    "    fold_val_scores = evals_result['valid_1'][lgb_params['metric']]\n",
    "\n",
    "    # evals_result also contains the extra non-improving rounds that were run\n",
    "    # only to confirm early stopping. Report the best iteration instead of the\n",
    "    # last one, since that is the model state used for the test prediction.\n",
    "    # best_iteration is a 1-based round count; fall back to every round that\n",
    "    # was run when early stopping did not record one.\n",
    "    best_round = model.best_iteration if (model.best_iteration or 0) > 0 else len(fold_val_scores)\n",
    "\n",
    "    print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(\n",
    "        fold_num + 1,\n",
    "        best_round,\n",
    "        fold_train_scores[best_round - 1],\n",
    "        fold_val_scores[best_round - 1],\n",
    "    ))\n",
    "    print()\n",
    "\n",
    "    cv_scores.append(fold_val_scores[best_round - 1])\n",
    "\n",
    "    # Pass the iteration explicitly so the prediction does not depend on the\n",
    "    # default value of `num_iteration`.\n",
    "    y_test_pred[:, fold_num] = model.predict(X_test, num_iteration=best_round).reshape(-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Print CV score and feature importance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>pagerank_q2</td>\n",
       "      <td>8535</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>177</th>\n",
       "      <td>oofp_nn_cnn_with_magic</td>\n",
       "      <td>8472</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>pagerank_q1</td>\n",
       "      <td>8085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>oofp_nn_mlp_with_magic</td>\n",
       "      <td>7991</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>179</th>\n",
       "      <td>oofp_nn_siamese_lstm_attention</td>\n",
       "      <td>7851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>178</th>\n",
       "      <td>oofp_nn_bi_lstm_with_magic</td>\n",
       "      <td>7512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162</th>\n",
       "      <td>meph_m_q1_q2_tf_oof</td>\n",
       "      <td>7487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>167</th>\n",
       "      <td>image_similarity</td>\n",
       "      <td>7450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164</th>\n",
       "      <td>meph_m_q1_q2_tf_svd1</td>\n",
       "      <td>7022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>meph_m_diff_q1_q2_tf_oof</td>\n",
       "      <td>6829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163</th>\n",
       "      <td>meph_m_q1_q2_tf_svd0</td>\n",
       "      <td>6710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>165</th>\n",
       "      <td>meph_m_q1_q2_tf_svd100_oof</td>\n",
       "      <td>6597</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>das_cosine</td>\n",
       "      <td>6476</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>jaccard_ix_diff_3_4</td>\n",
       "      <td>6436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>wordnet_similarity_brown</td>\n",
       "      <td>6400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>jaccard_ix_diff_4_5</td>\n",
       "      <td>6394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>abh_kur_q2vec</td>\n",
       "      <td>6375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>das_tfidf_word_match</td>\n",
       "      <td>6263</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>abh_skew_q2vec</td>\n",
       "      <td>6076</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>das_avg_word_len2</td>\n",
       "      <td>6027</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>das_diff_avg_word</td>\n",
       "      <td>6013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>abh_kur_q1vec</td>\n",
       "      <td>5933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>jaccard_ix_diff_2_3</td>\n",
       "      <td>5890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>pos_tag_cosine</td>\n",
       "      <td>5876</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>das_avg_word_len1</td>\n",
       "      <td>5823</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>abh_norm_wmd</td>\n",
       "      <td>5755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>abh_skew_q1vec</td>\n",
       "      <td>5676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>158</th>\n",
       "      <td>meph_trigram_tfidf_cosine</td>\n",
       "      <td>5602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>das_word_match</td>\n",
       "      <td>5468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69</th>\n",
       "      <td>wordnet_similarity_raw</td>\n",
       "      <td>5310</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>meph_trigram_tfidf_l2_euclidean</td>\n",
       "      <td>5257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>das_diff_stops_r</td>\n",
       "      <td>5214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>meph_trigram_tfidf_l1_euclidean</td>\n",
       "      <td>5173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>abh_wmd</td>\n",
       "      <td>4991</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>das_shared_2gram</td>\n",
       "      <td>4856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>181</th>\n",
       "      <td>wordmatchshare_tfidf</td>\n",
       "      <td>4782</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>tfidf_cosine</td>\n",
       "      <td>4765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>meph_unigram_jaccard</td>\n",
       "      <td>4713</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>das_stops2_ratio</td>\n",
       "      <td>4527</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79</th>\n",
       "      <td>q1_q2_wm_ratio</td>\n",
       "      <td>4451</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77</th>\n",
       "      <td>wmd</td>\n",
       "      <td>4419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>char_len_ratio</td>\n",
       "      <td>4367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>phrase_emb_mean_cityblock_log</td>\n",
       "      <td>4334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>das_words_hamming</td>\n",
       "      <td>4255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>jaro_winkler</td>\n",
       "      <td>4251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>das_stops1_ratio</td>\n",
       "      <td>4234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>jaccard_ix_norm_q1_2gram</td>\n",
       "      <td>4227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>fuzz_token_sort_ratio</td>\n",
       "      <td>4222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>phrase_emb_mean_euclidean</td>\n",
       "      <td>4153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>jaro</td>\n",
       "      <td>4125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>jaccard_ix_norm_q2_2gram</td>\n",
       "      <td>4118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>fuzz_partial_ratio</td>\n",
       "      <td>4105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>fuzz_partial_token_sort_ratio</td>\n",
       "      <td>3922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>meph_unigram_all_jaccard</td>\n",
       "      <td>3722</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>meph_ratio_len1_len2</td>\n",
       "      <td>3710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>tfidf_euclidean</td>\n",
       "      <td>3695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>word_diff_ratio</td>\n",
       "      <td>3635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>magic_comatrix_euclidean</td>\n",
       "      <td>3533</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>longer_char_len_log</td>\n",
       "      <td>3532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>180</th>\n",
       "      <td>wordmatchshare_percent</td>\n",
       "      <td>3524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>meph_bigram_jaccard</td>\n",
       "      <td>3506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88</th>\n",
       "      <td>abh_canberra_distance</td>\n",
       "      <td>3336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>174</th>\n",
       "      <td>magic_comatrix_cosine</td>\n",
       "      <td>3321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>jaccard_ix_norm_q2_5gram</td>\n",
       "      <td>3275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>jaccard_ix_2gram</td>\n",
       "      <td>3266</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>jaccard_ix_norm_q1_5gram</td>\n",
       "      <td>3250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>abh_cosine_distance</td>\n",
       "      <td>3234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>fuzz_ratio</td>\n",
       "      <td>3192</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>161</th>\n",
       "      <td>meph_trigram_tf_l2_euclidean</td>\n",
       "      <td>3179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>fuzz_token_set_ratio</td>\n",
       "      <td>3037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>phrase_emb_normsum_cityblock_log</td>\n",
       "      <td>3009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>abh_fuzz_qratio</td>\n",
       "      <td>2964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>shorter_char_len_log</td>\n",
       "      <td>2961</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>jaccard_ix_norm_q1_3gram</td>\n",
       "      <td>2913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>phrase_emb_mean_cosine</td>\n",
       "      <td>2862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>abh_braycurtis_distance</td>\n",
       "      <td>2862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>magic_freq_q1_q2_ratio</td>\n",
       "      <td>2829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>token_len_ratio</td>\n",
       "      <td>2790</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>das_len_q2</td>\n",
       "      <td>2785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>153</th>\n",
       "      <td>meph_bigram_all_jaccard</td>\n",
       "      <td>2752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>das_word_match_2root</td>\n",
       "      <td>2746</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>das_len_q1</td>\n",
       "      <td>2720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>jaccard_ix_norm_q2_3gram</td>\n",
       "      <td>2710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>das_len_char_q2</td>\n",
       "      <td>2691</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>das_len_char_q1</td>\n",
       "      <td>2651</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90</th>\n",
       "      <td>abh_minkowski_distance</td>\n",
       "      <td>2650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>abh_cityblock_distance</td>\n",
       "      <td>2629</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>jaccard_ix_norm_q1_4gram</td>\n",
       "      <td>2603</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156</th>\n",
       "      <td>meph_trigram_all_jaccard</td>\n",
       "      <td>2595</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>jaccard_ix_5gram</td>\n",
       "      <td>2584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>longer_token_len_log</td>\n",
       "      <td>2491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>magic_freq_q1</td>\n",
       "      <td>2468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>jaccard_ix_norm_q2_4gram</td>\n",
       "      <td>2460</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>meph_trigram_jaccard</td>\n",
       "      <td>2427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>171</th>\n",
       "      <td>magic_freq_q2</td>\n",
       "      <td>2380</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>char_len_diff_log</td>\n",
       "      <td>2360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>phrase_emb_normsum_euclidean</td>\n",
       "      <td>2344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>pos_tag_euclidean</td>\n",
       "      <td>2321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80</th>\n",
       "      <td>abh_common_words</td>\n",
       "      <td>2207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>das_diff_len_char</td>\n",
       "      <td>2169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>jaccard_ix_4gram</td>\n",
       "      <td>2139</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>jaccard_ix_3gram</td>\n",
       "      <td>2136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>meph_unigram_all_jaccard_max</td>\n",
       "      <td>2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>meph_log_ratio_len1_len2</td>\n",
       "      <td>1967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173</th>\n",
       "      <td>magic_freq_q2_q1_ratio</td>\n",
       "      <td>1927</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>abh_fuzz_WRatio</td>\n",
       "      <td>1904</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>phrase_emb_normsum_cosine</td>\n",
       "      <td>1871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>shorter_token_len_log</td>\n",
       "      <td>1808</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>das_diff_caps</td>\n",
       "      <td>1754</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>89</th>\n",
       "      <td>abh_euclidean_distance</td>\n",
       "      <td>1710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>das_len_word_q1</td>\n",
       "      <td>1697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>pos_q2_verb</td>\n",
       "      <td>1663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>das_len_word_q2</td>\n",
       "      <td>1636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>das_caps_count_q1</td>\n",
       "      <td>1617</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>meph_bigram_all_jaccard_max</td>\n",
       "      <td>1442</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>pos_q1_noun</td>\n",
       "      <td>1440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>das_caps_count_q2</td>\n",
       "      <td>1414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>157</th>\n",
       "      <td>meph_trigram_all_jaccard_max</td>\n",
       "      <td>1403</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>pos_q2_noun</td>\n",
       "      <td>1396</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>pos_q1_verb</td>\n",
       "      <td>1368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>pos_q2_propn</td>\n",
       "      <td>1364</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>q1_q2_intersect</td>\n",
       "      <td>1312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>das_shared_count</td>\n",
       "      <td>1310</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>das_diff_len_word</td>\n",
       "      <td>1152</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>das_diff_len</td>\n",
       "      <td>1122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>pos_q1_propn</td>\n",
       "      <td>1107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>ner_tag_euclidean</td>\n",
       "      <td>1068</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>pos_q1_num</td>\n",
       "      <td>1046</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>token_len_diff_log</td>\n",
       "      <td>1037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>pos_q1_adj</td>\n",
       "      <td>1035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>pos_q2_adj</td>\n",
       "      <td>993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>pos_q2_num</td>\n",
       "      <td>937</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>pos_q1_adv</td>\n",
       "      <td>911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>pos_q2_adv</td>\n",
       "      <td>864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>whq_count_q1</td>\n",
       "      <td>856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>whq_count_q2</td>\n",
       "      <td>784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>meph_abs_diff_len1_len2</td>\n",
       "      <td>626</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>ner_tag_count_diff</td>\n",
       "      <td>589</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>whq_count_diff</td>\n",
       "      <td>580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>ner_q1_cardinal</td>\n",
       "      <td>461</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>ner_q1_person</td>\n",
       "      <td>387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>meph_log_abs_diff_len1_len2</td>\n",
       "      <td>386</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>ner_q2_cardinal</td>\n",
       "      <td>365</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>das_why_both</td>\n",
       "      <td>361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>ner_q1_org</td>\n",
       "      <td>343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>ner_q2_org</td>\n",
       "      <td>335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>ner_q2_gpe</td>\n",
       "      <td>335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>das_q1_why</td>\n",
       "      <td>313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>ner_q1_gpe</td>\n",
       "      <td>308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>das_q1_what</td>\n",
       "      <td>279</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>122</th>\n",
       "      <td>das_q2_how</td>\n",
       "      <td>269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>ner_q2_person</td>\n",
       "      <td>268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>ner_q2_norp</td>\n",
       "      <td>264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>ner_q1_norp</td>\n",
       "      <td>262</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>das_what_both</td>\n",
       "      <td>259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>das_q1_how</td>\n",
       "      <td>253</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>das_q2_which</td>\n",
       "      <td>250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>das_q2_why</td>\n",
       "      <td>230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>das_how_both</td>\n",
       "      <td>225</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>ner_q2_date</td>\n",
       "      <td>189</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125</th>\n",
       "      <td>das_q2_what</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>das_q1_which</td>\n",
       "      <td>159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133</th>\n",
       "      <td>das_q1_where</td>\n",
       "      <td>151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>129</th>\n",
       "      <td>das_which_both</td>\n",
       "      <td>142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>137</th>\n",
       "      <td>das_q2_when</td>\n",
       "      <td>116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>ner_q1_date</td>\n",
       "      <td>115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>ner_q1_quantity</td>\n",
       "      <td>115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>das_q2_where</td>\n",
       "      <td>114</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>das_q1_who</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>das_who_both</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>das_q2_who</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>abh_jaccard_distance</td>\n",
       "      <td>92</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>ner_q1_product</td>\n",
       "      <td>76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>138</th>\n",
       "      <td>das_when_both</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>das_q1_when</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>ner_q2_product</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>ner_q1_loc</td>\n",
       "      <td>47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135</th>\n",
       "      <td>das_where_both</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>ner_q2_loc</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>ner_q2_quantity</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>ner_q1_time</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>ner_q2_time</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               column  importance\n",
       "169                       pagerank_q2        8535\n",
       "177            oofp_nn_cnn_with_magic        8472\n",
       "168                       pagerank_q1        8085\n",
       "176            oofp_nn_mlp_with_magic        7991\n",
       "179    oofp_nn_siamese_lstm_attention        7851\n",
       "178        oofp_nn_bi_lstm_with_magic        7512\n",
       "162               meph_m_q1_q2_tf_oof        7487\n",
       "167                  image_similarity        7450\n",
       "164              meph_m_q1_q2_tf_svd1        7022\n",
       "166          meph_m_diff_q1_q2_tf_oof        6829\n",
       "163              meph_m_q1_q2_tf_svd0        6710\n",
       "165        meph_m_q1_q2_tf_svd100_oof        6597\n",
       "103                        das_cosine        6476\n",
       "22                jaccard_ix_diff_3_4        6436\n",
       "70           wordnet_similarity_brown        6400\n",
       "23                jaccard_ix_diff_4_5        6394\n",
       "95                      abh_kur_q2vec        6375\n",
       "98               das_tfidf_word_match        6263\n",
       "93                     abh_skew_q2vec        6076\n",
       "119                 das_avg_word_len2        6027\n",
       "120                 das_diff_avg_word        6013\n",
       "94                      abh_kur_q1vec        5933\n",
       "21                jaccard_ix_diff_2_3        5890\n",
       "65                     pos_tag_cosine        5876\n",
       "118                 das_avg_word_len1        5823\n",
       "84                       abh_norm_wmd        5755\n",
       "92                     abh_skew_q1vec        5676\n",
       "158         meph_trigram_tfidf_cosine        5602\n",
       "96                     das_word_match        5468\n",
       "69             wordnet_similarity_raw        5310\n",
       "159   meph_trigram_tfidf_l2_euclidean        5257\n",
       "105                  das_diff_stops_r        5214\n",
       "160   meph_trigram_tfidf_l1_euclidean        5173\n",
       "83                            abh_wmd        4991\n",
       "102                  das_shared_2gram        4856\n",
       "181              wordmatchshare_tfidf        4782\n",
       "31                       tfidf_cosine        4765\n",
       "149              meph_unigram_jaccard        4713\n",
       "101                  das_stops2_ratio        4527\n",
       "79                     q1_q2_wm_ratio        4451\n",
       "77                                wmd        4419\n",
       "3                      char_len_ratio        4367\n",
       "72      phrase_emb_mean_cityblock_log        4334\n",
       "104                 das_words_hamming        4255\n",
       "30                       jaro_winkler        4251\n",
       "100                  das_stops1_ratio        4234\n",
       "10           jaccard_ix_norm_q1_2gram        4227\n",
       "26              fuzz_token_sort_ratio        4222\n",
       "73          phrase_emb_mean_euclidean        4153\n",
       "29                               jaro        4125\n",
       "11           jaccard_ix_norm_q2_2gram        4118\n",
       "25                 fuzz_partial_ratio        4105\n",
       "28      fuzz_partial_token_sort_ratio        3922\n",
       "150          meph_unigram_all_jaccard        3722\n",
       "147              meph_ratio_len1_len2        3710\n",
       "32                    tfidf_euclidean        3695\n",
       "8                     word_diff_ratio        3635\n",
       "175          magic_comatrix_euclidean        3533\n",
       "1                 longer_char_len_log        3532\n",
       "180            wordmatchshare_percent        3524\n",
       "152               meph_bigram_jaccard        3506\n",
       "88              abh_canberra_distance        3336\n",
       "174             magic_comatrix_cosine        3321\n",
       "20           jaccard_ix_norm_q2_5gram        3275\n",
       "9                    jaccard_ix_2gram        3266\n",
       "19           jaccard_ix_norm_q1_5gram        3250\n",
       "85                abh_cosine_distance        3234\n",
       "24                         fuzz_ratio        3192\n",
       "161      meph_trigram_tf_l2_euclidean        3179\n",
       "27               fuzz_token_set_ratio        3037\n",
       "75   phrase_emb_normsum_cityblock_log        3009\n",
       "81                    abh_fuzz_qratio        2964\n",
       "0                shorter_char_len_log        2961\n",
       "13           jaccard_ix_norm_q1_3gram        2913\n",
       "71             phrase_emb_mean_cosine        2862\n",
       "91            abh_braycurtis_distance        2862\n",
       "172            magic_freq_q1_q2_ratio        2829\n",
       "7                     token_len_ratio        2790\n",
       "107                        das_len_q2        2785\n",
       "153           meph_bigram_all_jaccard        2752\n",
       "97               das_word_match_2root        2746\n",
       "106                        das_len_q1        2720\n",
       "14           jaccard_ix_norm_q2_3gram        2710\n",
       "113                   das_len_char_q2        2691\n",
       "112                   das_len_char_q1        2651\n",
       "90             abh_minkowski_distance        2650\n",
       "86             abh_cityblock_distance        2629\n",
       "16           jaccard_ix_norm_q1_4gram        2603\n",
       "156          meph_trigram_all_jaccard        2595\n",
       "18                   jaccard_ix_5gram        2584\n",
       "5                longer_token_len_log        2491\n",
       "170                     magic_freq_q1        2468\n",
       "17           jaccard_ix_norm_q2_4gram        2460\n",
       "155              meph_trigram_jaccard        2427\n",
       "171                     magic_freq_q2        2380\n",
       "2                   char_len_diff_log        2360\n",
       "76       phrase_emb_normsum_euclidean        2344\n",
       "66                  pos_tag_euclidean        2321\n",
       "80                   abh_common_words        2207\n",
       "114                 das_diff_len_char        2169\n",
       "15                   jaccard_ix_4gram        2139\n",
       "12                   jaccard_ix_3gram        2136\n",
       "151      meph_unigram_all_jaccard_max        2016\n",
       "148          meph_log_ratio_len1_len2        1967\n",
       "173            magic_freq_q2_q1_ratio        1927\n",
       "82                    abh_fuzz_WRatio        1904\n",
       "74          phrase_emb_normsum_cosine        1871\n",
       "4               shorter_token_len_log        1808\n",
       "111                     das_diff_caps        1754\n",
       "89             abh_euclidean_distance        1710\n",
       "115                   das_len_word_q1        1697\n",
       "54                        pos_q2_verb        1663\n",
       "116                   das_len_word_q2        1636\n",
       "109                 das_caps_count_q1        1617\n",
       "154       meph_bigram_all_jaccard_max        1442\n",
       "35                        pos_q1_noun        1440\n",
       "110                 das_caps_count_q2        1414\n",
       "157      meph_trigram_all_jaccard_max        1403\n",
       "51                        pos_q2_noun        1396\n",
       "38                        pos_q1_verb        1368\n",
       "52                       pos_q2_propn        1364\n",
       "78                    q1_q2_intersect        1312\n",
       "99                   das_shared_count        1310\n",
       "117                 das_diff_len_word        1152\n",
       "108                      das_diff_len        1122\n",
       "36                       pos_q1_propn        1107\n",
       "67                  ner_tag_euclidean        1068\n",
       "37                         pos_q1_num        1046\n",
       "6                  token_len_diff_log        1037\n",
       "33                         pos_q1_adj        1035\n",
       "49                         pos_q2_adj         993\n",
       "53                         pos_q2_num         937\n",
       "34                         pos_q1_adv         911\n",
       "50                         pos_q2_adv         864\n",
       "142                      whq_count_q1         856\n",
       "143                      whq_count_q2         784\n",
       "145           meph_abs_diff_len1_len2         626\n",
       "68                 ner_tag_count_diff         589\n",
       "144                    whq_count_diff         580\n",
       "48                    ner_q1_cardinal         461\n",
       "43                      ner_q1_person         387\n",
       "146       meph_log_abs_diff_len1_len2         386\n",
       "64                    ner_q2_cardinal         365\n",
       "141                      das_why_both         361\n",
       "41                         ner_q1_org         343\n",
       "57                         ner_q2_org         335\n",
       "55                         ner_q2_gpe         335\n",
       "139                        das_q1_why         313\n",
       "39                         ner_q1_gpe         308\n",
       "124                       das_q1_what         279\n",
       "122                        das_q2_how         269\n",
       "59                      ner_q2_person         268\n",
       "58                        ner_q2_norp         264\n",
       "42                        ner_q1_norp         262\n",
       "126                     das_what_both         259\n",
       "121                        das_q1_how         253\n",
       "128                      das_q2_which         250\n",
       "140                        das_q2_why         230\n",
       "123                      das_how_both         225\n",
       "61                        ner_q2_date         189\n",
       "125                       das_q2_what         182\n",
       "127                      das_q1_which         159\n",
       "133                      das_q1_where         151\n",
       "129                    das_which_both         142\n",
       "137                       das_q2_when         116\n",
       "45                        ner_q1_date         115\n",
       "47                    ner_q1_quantity         115\n",
       "134                      das_q2_where         114\n",
       "130                        das_q1_who         105\n",
       "132                      das_who_both          98\n",
       "131                        das_q2_who          98\n",
       "87               abh_jaccard_distance          92\n",
       "44                     ner_q1_product          76\n",
       "138                     das_when_both          72\n",
       "136                       das_q1_when          54\n",
       "60                     ner_q2_product          50\n",
       "40                         ner_q1_loc          47\n",
       "135                    das_where_both          33\n",
       "56                         ner_q2_loc          19\n",
       "63                    ner_q2_quantity          18\n",
       "46                        ner_q1_time           8\n",
       "62                        ner_q2_time           6"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair each training column with the model's importance score and show the\n",
    "# highest-scoring features first.\n",
    "(\n",
    "    pd.DataFrame(dict(\n",
    "        column=list(df_train.columns),\n",
    "        importance=model.feature_importance(),\n",
    "    ))\n",
    "    .sort_values('importance', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_cv_score = np.mean(cv_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final CV score: 0.18994530716077662\n"
     ]
    }
   ],
   "source": [
    "print('Final CV score:', final_cv_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test = np.mean(y_test_pred, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_submission = pd.DataFrame({\n",
    "    'test_id': range(len(y_test)),\n",
    "    'is_duplicate': y_test\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Recalibrate predictions for a different target balance on test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on [Mike Swarbrick Jones' blog](https://swarbrickjones.wordpress.com/2017/03/28/cross-entropy-and-training-test-class-imbalance/)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$\\alpha = \\frac{p_{test}}{p_{train}}$\n",
    "\n",
    "$\\beta = \\frac{1 - p_{test}}{1 - p_{train}}$\n",
    "\n",
    "$\\hat{y}_{test}^{\\prime} = \\frac{\\alpha \\hat{y}_{test}}{\\alpha \\hat{y}_{test} + \\beta(1 - \\hat{y}_{test})}$"
   ]
  },
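  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The share of positive (duplicate) pairs is 36.92% in the training set and approximately 16.5% in the test set; these are the values of $p_{train}$ and $p_{test}$ used in the formula above."
   ]
  },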
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "def recalibrate_prediction(pred, train_pos_ratio=0.3692, test_pos_ratio=0.165):\n",
    "    \"\"\"Re-weight a predicted probability for a different share of positives.\n",
    "\n",
    "    Computes ``a * pred / (a * pred + b * (1 - pred))`` where\n",
    "    ``a = test_pos_ratio / train_pos_ratio`` and\n",
    "    ``b = (1 - test_pos_ratio) / (1 - train_pos_ratio)``.\n",
    "\n",
    "    Args:\n",
    "        pred: Predicted probability of the positive class. Only arithmetic\n",
    "            operators are applied to it.\n",
    "        train_pos_ratio: Share of positive examples in the training data.\n",
    "        test_pos_ratio: Share of positive examples expected in the test data.\n",
    "\n",
    "    Returns:\n",
    "        The re-weighted probability.\n",
    "    \"\"\"\n",
    "    pos_weight = test_pos_ratio / train_pos_ratio\n",
    "    neg_weight = (1 - test_pos_ratio) / (1 - train_pos_ratio)\n",
    "    weighted_pos = pos_weight * pred\n",
    "    return weighted_pos / (weighted_pos + neg_weight * (1 - pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recalibrate from the fold-averaged predictions in `y_test` rather than from the\n",
    "# column itself, so re-running this cell never applies the correction twice.\n",
    "# The function is plain arithmetic, so it is applied to the whole array at once\n",
    "# instead of element by element via `.map`.\n",
    "df_submission['is_duplicate'] = recalibrate_prediction(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_submission = df_submission[['test_id', 'is_duplicate']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Explore and save submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x10921dda0>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdIAAAEPCAYAAAD2wEXHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHfpJREFUeJzt3Xu8HWV56PHfE4hJKGQHCNUEeqTiBSitcKTcRVqxVikUK1g/VPFGsfZUwdqKt4otrUKhBXuxR8Ua0eNRDBWoSPXITUEhKkSLlSJUvJAUCTGbBNghZD/nj5kVFytr7yQzazGzdn7fz2c+a+133nnX82Zfnrwz77wTmYkkSapmVtMBSJI0ykykkiTVYCKVJKkGE6kkSTWYSCVJqsFEKklSDSZSSZJqMJFKklSDiVSSpBpMpJIk1WAilSSpBhOpJEk1mEglSarBRCpJUg07Nh2ABiMiHqP4j9GDTcciSSNkPjCZmZXzYfg80pkhIiaBGBsbazoUSRoZ4+PjAJmZlc/QOiKdOR4cGxsbW7NmTdNxSNLIWLBgAePj47XO5HmNVJKkGkykkiTVYCKVJKkGE6kkSTWYSCVJqsFZu5I0Q2Qmq1atYmJigsnJyabDacysWbPYcccdmT9/Pj/3cz839M8zkUrSDJCZ3Hvvvaxdu5Y5c+awww47NB1SYzZs2MDDDz/MmjVr2GWXXVi8eDGzZg3vBKyJVJJmgFWrVrF27Vqe/OQns9tuuzUdTuMmJyd54IEHWLVqFePj4+y6665D+ywTqQDY+21XNfbZ95x7XGOfLc0UExMTzJkzxyRamjVrFgsXLuTBBx9k3bp1Q02kTjaSpBlgcnJyuz6d209EsOOOOw79erGJVJKkGkykkiTVYCKVJKkGE6kkSTWYSCVJrbd+/XrOOussFi9ezLx58zjssMO45pprmg4L8PYXSZrxmry9raPubW6vfvWrueyyyzjzzDN5+tOfzpIlS3jRi17EDTfcwOGHHz6gKKsxkUqSWm3ZsmV86lOf4sILL+TMM88E4NRTT+WAAw7grLPO4stf/nKj8XlqV5LUakuXLmX27Nmcdtppm8rmzp3L6173Om688UZWrlzZYHQmUklSy912223su+++7Lzzzo8rP+SQQ8hMli9f3lBkBROpJKnVVq5cyaJFizYr75StWLHiiQ7pcUykkqRWe+SRR5gzZ85m5XPnzt20v0kmUklSq82bN4/169dvVj4xMbFpf5NMpJKkVlu0aFHfCUWdssWLFz/RIT2OiVSS1GoHHnggd9xxB+vWrXtc+S233ALAs5/97CbC2sREKklqtZNOOokNGzZw8cUXbypbv349H/3oRznyyCMbH5G6IIMkqdUOPfRQTj75ZN761reycuVK9tlnHz72sY/xgx/8gCVLljQdXrMj0ohYFBHnRsR1EbE2IjIijulT755yX+92bp+6CyLiQxFxf0Q8FBHXRsSBU3z+CRFxa0RMRMQPI+LsiNjsPxdNtylJ27tLLrmEM844g0suuYQ3velNbNiwgc9//vMceeSRTYfW+Ij0WcBZwF3At4Ejpqn7TeCinrLbu7+IiFnAVcAvAxcADwB/CFwfEc/JzLu76r4IuBy4Fnhjecy7gYXl161oU5LqqrvObRvMnTuX888/n/PPP7/pUDbTdCL9JrAwMx+IiBOBz05T98eZ+YkttHcSRTJ+SWZeDhARlwJ3AmcDp3bVvQC4DXhhZm4s6z4IvD0i/i4zv9eSNiVJLdboqd3MXJuZD2xt/YiYExE7TVPlJGAFcEXXZ9wPXAqcGBGzy3b2B/YHPthJeKUPUPybvLQNbUqS2m+UZu3+BvAQ8FBE3B0Rp/epcxDwzczMnvJlwC7A07vqAXyju1JmrgB+3LW/6TY3iYg1023AWO8xkqThG5VE+m2KU54vBX4fWAV8MCLe1lNvEdDvMQCdssVd9Zimbvdc6ibblCS1
XNPXSLdKZp7Q/XVEfBS4EfiziPinzBwvd80DNl9HCia69ne/TlW3+/Rxk21ukpkL+tTfxFGpJDVjVEakj1Neg7yIIjl1Pxr9EWDzlY1hbtf+7tep6navgNxkm5KklhvJRFr6Ufm6W1fZSn52irVbp2xFVz2mqdv9TJ4m25SkrTJr1iw2bty45YrbkczkscceY9as4aa6UU6kTytf7+8qWw48JyKip+6hwDqK+1U79QAO7q4UEYuBvbr2N92mJG2VuXPnsn79elavXt10KK0wOTnJ/fffz6OPPrrZA8EHrfXXSCNiN2BNZk52lc0F/hRYC3ytq/pSiltLfptiYQQiYiFwMnBFZm4AyMzvRMQdwOkR8ZGu21XeAEwCl7WhTUnaWgsXLmT9+vXcd999rFmzhh122KHpkBqzceNGNmzYwOTkJPPnz2dsbLjTRxpPpBHxrvLtfuXrKyPiKIrk+Q/ACcA7I2IpcA+wO/Aq4JnAGzKz+3EAS4GbgUsi4gKK2b1/SDHyfk/PR/8pcCXwhYj4NHAA8EcU94He2aI2JWmLIoI999yTVatWMTExweTk5JYPmqFmz57NvHnzGBsbY6edplt6YDBi81sZn1gRMVUAP8jMvSPiORTJ5SBgD4rZrrcCF2Tm5/q0tytwPnAixezXZcBbMvPWPnVPpLitZj+KU8T/DJyTmY+1qc2tERFrxsbGxtasWVPlcPZ+21WVjhuEmbB8maTRtGDBAsbHx8e3dGfEdBpPpBoME6kkbbtBJNJRnmwkSVLjTKSSJNVgIpUkqQYTqSRJNZhIJUmqwUQqSVINJlJJkmowkUqSVIOJVJKkGkykkiTVYCKVJKkGE6kkSTWYSCVJqsFEKklSDSZSSZJqMJFKklSDiVSSpBpMpJIk1WAilSSpBhOpJEk1mEglSarBRCpJUg0mUkmSatjmRBoR342It0TEHsMISJKkUVJlRBrA+cCPI2JpRPxmRMSA45IkaSRscyLNzH2Bo4FPAi8ErgJ+EBF/HhFPHXB8kiS1WqVrpJl5Y2a+BlgE/AGwAvgz4O6I+GJEvCwiZg8wTkmSWqnWZKPMXJeZH87Mw4ADgEuBY4H/C6yIiAsiYq8BxClJUivVnrUbEbMi4njgvcDJZfFXgFuBNwN3RMRv1f0cSZLaqHIijYhnRMT7gB8BVwBHAO8H9s3MYzLzhcD+wF3ABYMIVpKkttlxWw+IiFcBrwWOKouuB/4Y+JfM3NBdNzP/MyIuAj5cM05JklppmxMp8FHgfopR5ocz864t1P8u8KkKnyNJUutVSaQvA67oHX1OJTNvAW6p8DmSJLXeNifSzFw6jEAkSRpFVZYIfHdELJ9m/20R8fZ6YUmSNBqqzNp9KcUEo6lcR3H6V5KkGa9KIv1FiglEU/lP4GnVwpEkabRUXbR+wTT7x4AdqoUjSdJoqZJI/wM4fpr9x1OMSiVJmvGqJNJ/Bo6IiI9ExG6dwojYLSIupljh6J8HFaAkSW1W5faXD0bErwGvAV4VET8ud+1FkZiXZuY/DjBGSZJaq+pj1F4OvAL4ArC+3K4GTslMZ+xKkrYbVVY2AiAzP0nxcG9JkrZbtR+jJknS9qzSiDQidgJ+F3gGsDvFLTHdMjNfXzM2SZJar8pj1A4GPgfsweYJtCMBE6kkacarcmr3QmAe8HvAU4DZfbYnDSpASZLarEoiPRj4m8z8VGb+JDM39tu2pqGIWBQR50bEdRGxNiIyIo6Zou4JEXFrRExExA8j4uyI2GxEHRELIuJDEXF/RDwUEddGxIGj3KYkqb2qJNK1FA/2HoRnAWdR3IP67akqRcSLgMuB1cAby/fvphgdd9ebBVwFvBz4e+CtwJOB6yNin1FsU5LUblUmG10O/AbwTwP4/G8CCzPzgYg4EfjsFPUuAG4DXtgZ7UbEg8DbI+LvMvN7Zb2TKFZWeklmXl7WuxS4EzgbOHUE25QktViVEelbgT0j4sKIeGqdD8/MtZn5wHR1ImJ/YH/g
gz2njD9AEf9Lu8pOAlYAV3R9xv3ApcCJETF7lNqUJLVflUR6P3AQ8CbgvyJiQ0Q82rOtH2CMB5Wv3+guzMwVwI+79nfqfjMzs6eNZcAuwNNHrE1JUstVObX7aYrbW54oi8rXlX32rQQW99S9dop6lHW/O0JtbhIRa/rU7za2hf2SpCGosmj9K4YRyDTmla/9RrkTwE49daeq193WqLQpSWq5ymvtPoEeKV/n9Nk3t2t/p+5U9brbGpU2N8nM6R6m3hmxOiqVpCdYpbV2I2JWRJwSEUsi4uqIeHZZvqAsX7ylNrZB53Tnoj77FlFM2umuO1U9uuqOSpuSpJbb5kQaEfOA64BPAC+juBVm93L3OuBvgT8YVIDA8vL14J44FlPcf7q8p+5zIqJ36cJDy9juGrE2JUktV2VE+h7gMOBkYG+61tvNzMeAfwF+cwCxddr8DnAHcHpE7NC16w3AJHBZV9lSiok6v90piIiFZaxXZOaGUWpTktR+Va6Rngx8KDMvi4jd++z/HsV9klslIt5Vvt2vfH1lRBwFrMnMfyjL/hS4EvhCRHwaOAD4I4p7Nu/sam4pcDNwSURcAKwC/pDiPwzv6fnoUWlTktRiVRLpnsC3ptn/EDB/G9o7p+fr15avPwD+ASAzPxcRv0Ox6s/fU9zL+pe9x2bmxoh4MXA+xX2u8yjuzTw1M+/qqTsSbUqS2q1KIl1N/4kyHfvT/17KvjJzqkex9da7nGJ5wi3V+ylwWrnNiDYlSe1V5RrptcBryklHj1MuGfha4At1A5MkaRRUSaR/TjFLdxlwOsUqRy+IiHOAW4ENwHsHFqEkSS22zYm0nDTzAorZun9Vvp4FvBP4b+AFmfnDQQYpSVJbVVrZKDOXAQeUD6LejyKZfg/4Rp+F2CVJmrFqLRGYmct5/EIDkiRtVyotEShJkgrbPCKNiA1s+TFqmZn9FmWXJGlGGdTzSHcE9qFYZ/bbwL/XjEuSpJEw0OeRRsTRFGvtnl4nKEmSRsVAr5Fm5peBJcBfD7JdSZLaahiTje6k51FikiTNVMNIpM8FJobQriRJrVNl1u4pU+zaDTgWOB74aJ2gJEkaFVVm7X6CYtZuv6e2bAQ+Bry5TlCSJI2KKon0BX3KkuLxav+VmQ/WC0mSpNFR5faXa4YRiCRJo8glAiVJqqHKZKMPVficzMzXVzhOkqRWq3KN9DR+tkRg74Sj6cpNpJKkGafKqd3FFI9O+xxwNLCw3J4HXAXcBiwCZndtTxpEsJIktU2VRHousCozfzszb8zM1eX2lcw8gWL27nmZubF7G2zYkiS1Q5VE+lvAldPsv6KsI0nSjFclkc6lOL07lT3LOpIkzXhVEulXgTdGxBG9OyLiSOCNZR1Jkma8KrN2/xj4CvCViLgZuKMs3xc4DFgLvGUw4UmS1G5VVja6PSIOBt4HHAccXu56BLgMeEdm3jW4ECVJaq8qI1Iy827gZRGxA/CUsvi/nZ0rSdreVEqkHWXivHdAsUiSNHIqrbUbETtHxDsi4vqI+G5EHFaWLyzLnznYMCVJaqcqa+3uDtwIPAP4PvA0YCeAzFwVEadRPOT7TwYYpyRJrVTl1O5fUtwrejhFIv1Jz/7LgWNrxiVJ0kiocmr3eOADmfl1frZIfbfvA79QKypJkkZElUS6B/C9afY/RnmqV5Kkma5KIr2P4rroVA4CflgtHEmSRkuVRPp54HUR8eTeHeVCDacy/aL2kiTNGFUS6V9QXBu9DTinfP+KiPg4xWze+ygetSZJ0oy3zYk0M1cAR1Ak0tcDAbwaOAW4DnhuZj4wwBglSWqtqksEfh84LiJ2A55FkUzvyszeW2EkSZrRtimRRsTOwN8CX8zMpZm5GvjaUCKTJGkEbNOp3cxcB7wSGBtOOJIkjZYqk43+A3jqoAORJGkUVUmk5wNviIh9Bh2MJEmjpspko6cBPwZuj4grKVY5erinTmbm++oGJ0lS21VdtL7j5CnqJGAilSTNeFUS
6TMGHoUkSSNqqxJpRBxCcZ/o6sy8e8gxSZI0MrZ2stHXgN/sfBERO0fEJyNi/+GEJUnSaNjaRBo9X88BXg48ZbDhTPHhEcdERE6x7dtT94iIuDEiHo6I/46I90fEZo91i4g5EXFeRKyIiEci4uaIeP4Un99Ym5Kkdqu0RGCDLgK+2VO2ovMmIg4ErgG+A/wxsBfwJxQzjY/vOW4J8NKyzbso1gu+OiKel5mbVmtqQZuSpBYbtUR6Q2ZePs3+9wIPAMeUqzAREfcAH46IX8/Ma8uyQyhG1G/OzIvKskuA24HzgKPb0KYkqf2qLMjQqIjYJSI2+w9ARMwHXgBc0klOpUuAdcDLuspOAjYAF3cKMnMC+AhwVEQsakmbkqSW25YR6YsjonNNdCeKe0VPLk9T9srMvLB2dJv7OLAz8FhEXAe8JTP/vdz3yxT9+UZPII9GxHLgoK7ig4A7ehIZwDKK68EHAitb0KYkqeW2JZGeUm7dXj9F3QQGmUgfBZYCVwOrgF+huKZ4Y0T8ambeCSwq667sc/xK4PCurxcB905RD2BxV70m29wkItb0K+/igwQkqQFbm0h/bahRbEFmfhX4alfRlRHxrxSjurOB3wPmlfvW92lioms/5fup6tFVt+k2JUktt1WJNDNvGHYg2yozvxURXwI6t5c8Ur7O6VN9btf+Tt2p6nW31XSbm2Tmgn7lHeWI1VGpJD3BRm6yUY8fAbuV7zunShf1qbeIrttkyrpT1aOrbtNtSpJabtQT6dOA+8v3twOPAQd3V4iIJ1FM9FneVbwc2Dcidu5p79Dy9VstaVOS1HIjkUgjYo8+ZUdRXLv9AkBmjgNfAl7Zk8xeSTHT9zNdZUuB2cBpXe3NAV4D3JSZK1rSpiSp5UZlQYZPR8TDFBOOVgEHAKeX79/TVe+dZZ3rI+JiihWD3gJcnZlf6lTKzFsi4jPAX5f3d94NvAp4KsVqRLShTUlS+43EiBS4HNiDItn8I8UyfJ8EfjUzf9iplJm3AsdSzIi9EPh94MP0f27qqcD7y9e/oxhNvjgzb+qu1II2JUktFpnZdAwagIhYMzY2NrZmzZZuN+1v77ddNeCItt495x7X2GdL2r4tWLCA8fHx8S3dGTGdURmRSpLUSiZSSZJqMJFKklSDiVSSpBpMpJIk1WAilSSpBhOpJEk1mEglSarBRCpJUg0mUkmSajCRSpJUg4lUkqQaTKSSJNVgIpUkqQYTqSRJNZhIJUmqwUQqSVINJlJJkmowkUqSVIOJVJKkGkykkiTVYCKVJKkGE6kkSTWYSCVJqsFEKklSDSZSSZJqMJFKklSDiVSSpBpMpJIk1WAilSSpBhOpJEk1mEglSarBRCpJUg07Nh2AtPfbrmrkc+8597hGPlfSzOKIVJKkGkykkiTVYCKVJKkGE6kkSTWYSCVJqsFEKklSDd7+ou1WU7fdgLfeSDOJI1JJkmowkUqSVIOndqUGuJqTNHM4IpUkqQZHpNJ2xJGwNHgmUklD5wxpzWQmUkkzmqPwJ872+h8mE2lDImIO8BfAK4FdgW8B78zMaxoNTNJAbK9JZXvkZKPmLAHeDHwCOAOYBK6OiMObDEqStG0ckTYgIg4BXg68OTMvKssuAW4HzgOObjA8SSOuydHw9sgRaTNOAjYAF3cKMnMC+AhwVEQsaiowSdK2cUTajIOAOzJzXU/5MiCAA4GV3TsiYs0W2hwbHx9nwYIFlQJ6cOKxSsdJUhss+N/V0tn4+DjA/DqfbSJtxiLg3j7lneS5uGK7OT4+/mCF48bK1/GKnzuK7PP2wT5vH8bG1wPV+jyfYo5KZSbSZswD1vcpn+ja/ziZWW2ouRU6o91hfkbb2Oftg33ePjTdZ6+RNuMRYE6f8rld+yVJI8BE2oyVFKd3e3XKVjyBsUiSajCRNmM5sG9E7NxTfmj5+q0nOB5JUkUm0mYsBWYDp3UKypWOXgPclJmOSCVpRDjZqAGZeUtEfAb46/Ke0buBVwFP
BV7dZGySpG1jIm3OqcA55euuwLeBF2fmTY1GJUnaJpGZTceghjU9dbwJ9nn7YJ+3D0332UQqSVINTjaSJKkGE6kkSTWYSCVJqsFEKklSDSbSGSwi5kTEeRGxIiIeiYibI+L5W3nsnhFxaUSsiYgHI+LyiPjFYcdcV9U+R8TvRMSnI+L7EfFwRNwREedHxNiWjm1ane9zTzufj4iMiIuGEecg1e1zRJwSEcsi4qGIWB0RN0TEIcOMua6av8/HRsT1EfFARPw0Ir4WES8bdsx1RMSiiDg3Iq6LiLXlz+Yx23D8fhHxbxGxrvwefywiFg4jVhPpzLYEeDPwCeAMikcFXR0Rh093ULl04XXAc4G/As4G/idwfUTsOsyAB2AJFfoMfAjYD/g48CbgC+XrTRExd7oDW2AJ1fq8SUQcBxw9lOiGYwkV+xwRfwl8DLi9PPbPKRZFecqwgh2QJVT7ff4t4IsU6wacDfwZsBH4dES8bpgB1/Qs4CxgL4r77LdaROwFfBnYB3gHcAFwPPDFiJg94DghM91m4AYcAiRwZlfZXOAu4MtbOPatFL+kB3WV7Qs8BvxF030bUp+P6VN2atneq5vu2zD63FX/ScCdwLvLti5qul9D/D4fUf5sv6TpfjyBfb6a4vnHc7rK5pRlNzTdt2ni3gXYvXx/Ytn/Y7by2A8A64A9u8qOLdt47aBjdUQ6c50EbAAu7hRk5gTwEeCocmnC6Y69OTNv6zr2DuAaoM2ngyr3OTOv71P82fJ1vwHGOGh1vs8dZ1A8A/eCoUQ4eHX6fAbw9cz8bETM6vPgiLaq0+f5wE8zc9MzkMv3P6XFj2zMzLWZ+UDFw18KXJmZ93a19yWK/zAO/G+YiXTmOgi4IzPX9ZQvAwI4sN9BETEL+BXgG312LwOeGRE7DTLQAarU52l0TvWtqhvYENXqc0Q8heJU3zsy8+HhhDhwdfr8fODrEfFeYBxYGxH3RMTvDSfUganT5xuAX4qIcyJin3I7B3gm8DfDCbc5EbEn8PNM/TfsoEF/pmvtzlyLKE7d9FpZvi6e4rjdKE77rOyzbyXFL21nof22qdrnqZxFcS3pX+oENWR1+/w+4D8prruNikp9Lq/v7w68nOL7ehawGvhfwCci4uHM/Gy/Y1ugzvf5ryiuFb4TeFdZtg44ITP/38AibI/O6Hyqv2E/HxE7ZObGQX2giXTmmges71M+0bV/quOoeGzTqvZ5MxFxCvA64H2Z2cb/NHRU7nM5S/VU4HlZXkQaEVX73DmNuztwWGbeAhARn6W41vhufnY6v23q/Gyvpzil+RmK/u0AnA5cGhHPz8yvDzLQFtjav2G9o/vKTKQz1yMUI8tec7v2T3UcFY9tWtU+P05EPJfi2tNVFKc926xSnyMigPcDl2XmjUOKbVjq/mx/v5NEobheGBFLgTMiYuc+p0/boM7P9t9TTFb61cycBIiIS4HvABcBRw4wzjZ4wv+GeY105lrJz05xdOuUTfXw8NUU/5Ob6tik/ymTNqja500i4tnAlRTT7X93kKd/hqRqn19C8cf1nyJi785W7ptfft3WMw91f7bv67PvPorLFm29b7hSnyPiScBpwOc6SRQgMzdQzOY9JCJm2oCq8/dpqn+vnwz699pEOnMtB/btMyvx0PL1W/0OKn/Z/h04uM/uQ4HvtXhSSqU+d0TEPsC/AT8BjsvMhwYf4sBV7fP/oPj9vxb4ftcG8Jry/fMGG+rA1PnZXg7s2Wf3XhTXTVcPKsgBq/p93p3izOMOffbNLvfFQCJsiXKm7v30/xt2CMW/5UCZSGeupRS/KKd1CiJiDsUfyZsyc0VZ9j8iYt8+xx4WEQd1Hfss4NcprrO0VeU+l7NXv0hxj+ELM7PNM3W7Ve3zv1KMSns3gM+V728devTV1PnZ/gzwCxHxgq5j51PcEvHVzGzrZYuqff4JsAb4ne6FCMqEfDxwezk6HVmdmcg9xZcBJ5QzeDv1nk8xU3nwf8OavunW
bXgbcCnwKHAexeSCm8qvj+yqc33xY/C443ahmHyxEvgT4Ezgh8CPKG+QbutWo8/LKU5bnwe8omc7vOl+DaPPU7TV+gUZan6fdwK+CzxIsaLRmRRnYB53bBu3Gn1+Z/l9/UbZ37cA/1GW/W7T/dpCn99Vbv+njPcj5dd/1FXnHuCenuN+geK2tTuBNwJvpzjbsBx40sDjbPofym14G8WF9fPLhDhBcQ/VsT11+v6BpTjV9RnKe+0orhs+rek+DavP5S/pVNuSpvs1rO9zn7ZGJZHW+dl+CsVSkKspJp3cCBzddJ+G3OdTgFsoFmF4GLiZEVjdaZrfyXu66myWSMvyX6JY6vOhst8fB/YYRpxRfqAkSarAa6SSJNVgIpUkqQYTqSRJNZhIJUmqwUQqSVINJlJJkmowkUqSVIOJVJKkGkykkiTVYCKVJKmG/w8iC3RQrA4iDgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pd.DataFrame(y_test).plot.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test duplicates with >0.9 confidence: 41721\n",
      "Test mean prediction: 0.1285997052576751\n",
      "Calibrated mean prediction: 0.07656442422713633\n"
     ]
    }
   ],
   "source": [
    "# Sanity checks on the submission: how many rows have is_duplicate > 0.9,\n",
    "# then the mean of y_test next to the mean of the submitted is_duplicate column.\n",
    "print('Test duplicates with >0.9 confidence:', int((df_submission['is_duplicate'] > 0.9).sum()))\n",
    "print('Test mean prediction:', np.mean(y_test))\n",
    "print('Calibrated mean prediction:', df_submission['is_duplicate'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure the submissions directory exists. exist_ok=True replaces the\n",
    "# separate exists() check, so there is no check-then-create race and the\n",
    "# cell can be re-run safely.\n",
    "os.makedirs(project.submissions_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the submission file, named after the submission id and the CV score.\n",
    "# The path is built with os.path.join so it is valid whether or not\n",
    "# project.submissions_dir ends with a path separator.\n",
    "df_submission.to_csv(\n",
    "    os.path.join(\n",
    "        project.submissions_dir,\n",
    "        f'{submission_id}-submission-draft-cv-{final_cv_score:.6f}.csv',\n",
    "    ),\n",
    "    header=True,\n",
    "    float_format='%.8f',\n",
    "    index=False,  # explicit boolean; the row index is still left out of the CSV\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
